library(Matrix)
library(tidyverse)
library(ggplot2)
library(ggrepel)
library(data.table)
library(Seurat)
library(AnnotationHub)
library(cowplot)
library(ensembldb)
# Set species and DB namedb
organism <- "Homo sapiens"
DB <- "EnsDb" # e.g. EnsDb, OrgDb, etc
# Connect to AnnotationHub
ah <- AnnotationHub(hub=getAnnotationHubOption("URL"))
# Bring the Ensembl DB
anno.db <- query(ah,
pattern=c(organism, DB),
ignore.case=T)
# Find the id from most recent annotation data
id <- mcols(anno.db) %>%
rownames() %>%
tail(1)
# Download the most recent annotation table
id.db <- ah[[id]]
# Extract gene-level info and save as a data frame
gene.db <- genes(id.db, return.type="data.frame")
# Assign your file paths
barcode.path <- "barcodes.tsv.gz" # cell ids
features.path <- "features.tsv.gz" # gene ids
matrix.path <- "matrix.mtx.gz" # counts
# Import the files
# readMM() turns into a sparse matrix (which only stores non-zero values)
# in order to minimize data size
mat <- readMM(file=matrix.path)
feature.names <- read.delim(features.path,
header=FALSE,
stringsAsFactors=FALSE)
barcode.names <- read.delim(barcode.path,
header=FALSE,
stringsAsFactors=FALSE)
# Assign row/column names
colnames(mat) <- barcode.names$V1
rownames(mat) <- feature.names$V2
# Create a seurat object
seurat <- CreateSeuratObject(counts=mat, # accepts a sparse matrix
min.features=100) # see above note
# Explore the metadata (see above description)
head(seurat@meta.data)
## orig.ident nCount_RNA nFeature_RNA
## AAACGAAAGATCCCAT-1 SeuratProject 7783 2325
## AAACGAACAGGCTACC-1 SeuratProject 4723 1645
## AAACGAAGTACAAGCG-1 SeuratProject 3061 1401
## AAACGAAGTATGTCCA-1 SeuratProject 1548 453
## AAACGAATCCTGCTAC-1 SeuratProject 4845 1829
## AAACGAATCTCTAAGG-1 SeuratProject 1169 609
# In case you have multiple samples to create a seurat object:
#
# for (file in c("ctrl_raw_feature_bc_matrix", "stim_raw_feature_bc_matrix")){
#
# seurat_data <- Read10X(data.dir = paste0("data/", file))
# seurat_obj <- CreateSeuratObject(counts = seurat_data,
# min.features = 100,
# project = file)
# assign(file, seurat_obj)
# }
#
# Create a merged Seurat object
# merged_seurat <- merge(x = ctrl_raw_feature_bc_matrix,
# y = stim_raw_feature_bc_matrix,
# add.cell.id = c("ctrl", "stim"))
# Check that the merged object has the appropriate sample-specific prefixes
# head(merged_seurat@meta.data)
# tail(merged_seurat@meta.data)
# Add number of genes per UMI for each cell to metadata
seurat$log10GenesPerUMI <- log10(seurat$nFeature_RNA) / log10(seurat$nCount_RNA)
# Explore the updated metadata
head(seurat@meta.data)
## orig.ident nCount_RNA nFeature_RNA log10GenesPerUMI
## AAACGAAAGATCCCAT-1 SeuratProject 7783 2325 0.8651493
## AAACGAACAGGCTACC-1 SeuratProject 4723 1645 0.8753335
## AAACGAAGTACAAGCG-1 SeuratProject 3061 1401 0.9026281
## AAACGAAGTATGTCCA-1 SeuratProject 1548 453 0.8326925
## AAACGAATCCTGCTAC-1 SeuratProject 4845 1829 0.8851977
## AAACGAATCTCTAAGG-1 SeuratProject 1169 609 0.9076876
# Compute percent mito ratio
# NOTE: "^MT-" is for human genes. Adjust it depending on your organism of interest.
# Calculate manually as instructed above if you don't have available identifier patterns
seurat$mitoRatio <- PercentageFeatureSet(object=seurat,
pattern="^MT-")
# Convert the ratio to percentage
seurat$mitoRatio <- seurat@meta.data$mitoRatio / 100
# Explore the updated metadata
head(seurat@meta.data)
## orig.ident nCount_RNA nFeature_RNA log10GenesPerUMI
## AAACGAAAGATCCCAT-1 SeuratProject 7783 2325 0.8651493
## AAACGAACAGGCTACC-1 SeuratProject 4723 1645 0.8753335
## AAACGAAGTACAAGCG-1 SeuratProject 3061 1401 0.9026281
## AAACGAAGTATGTCCA-1 SeuratProject 1548 453 0.8326925
## AAACGAATCCTGCTAC-1 SeuratProject 4845 1829 0.8851977
## AAACGAATCTCTAAGG-1 SeuratProject 1169 609 0.9076876
## mitoRatio
## AAACGAAAGATCCCAT-1 0.03880252
## AAACGAACAGGCTACC-1 0.03175947
## AAACGAAGTACAAGCG-1 0.08428618
## AAACGAAGTATGTCCA-1 0.20413437
## AAACGAATCCTGCTAC-1 0.02930857
## AAACGAATCTCTAAGG-1 0.05303678
# Create a dataframe from the metadata
meta <- seurat@meta.data
# Add cell ids to the data frame
meta$cells <- rownames(meta)
meta <- meta %>%
separate(cells, c("Barcode", "Sample"), sep="-")
# Rename some of the columns
old.colnames <- colnames(meta)
colnames(meta) <- c("seq.folder", "nUMI", "nGene",
old.colnames[4:length(old.colnames)])
# Add Tissue to the data frame
meta$Tissue <- ifelse(meta$Sample %in% c("1", "3", "5"),
"Atherosclerotic Core",
"Proximal Adjacent")
# Explore the updated data frame
head(meta)
## seq.folder nUMI nGene log10GenesPerUMI mitoRatio
## AAACGAAAGATCCCAT-1 SeuratProject 7783 2325 0.8651493 0.03880252
## AAACGAACAGGCTACC-1 SeuratProject 4723 1645 0.8753335 0.03175947
## AAACGAAGTACAAGCG-1 SeuratProject 3061 1401 0.9026281 0.08428618
## AAACGAAGTATGTCCA-1 SeuratProject 1548 453 0.8326925 0.20413437
## AAACGAATCCTGCTAC-1 SeuratProject 4845 1829 0.8851977 0.02930857
## AAACGAATCTCTAAGG-1 SeuratProject 1169 609 0.9076876 0.05303678
## Barcode Sample Tissue
## AAACGAAAGATCCCAT-1 AAACGAAAGATCCCAT 1 Atherosclerotic Core
## AAACGAACAGGCTACC-1 AAACGAACAGGCTACC 1 Atherosclerotic Core
## AAACGAAGTACAAGCG-1 AAACGAAGTACAAGCG 1 Atherosclerotic Core
## AAACGAAGTATGTCCA-1 AAACGAAGTATGTCCA 1 Atherosclerotic Core
## AAACGAATCCTGCTAC-1 AAACGAATCCTGCTAC 1 Atherosclerotic Core
## AAACGAATCTCTAAGG-1 AAACGAATCTCTAAGG 1 Atherosclerotic Core
# Update the seurat object with the updated metadata
seurat@meta.data <- meta
# Explore the updated metadata
head(seurat@meta.data)
## seq.folder nUMI nGene log10GenesPerUMI mitoRatio
## AAACGAAAGATCCCAT-1 SeuratProject 7783 2325 0.8651493 0.03880252
## AAACGAACAGGCTACC-1 SeuratProject 4723 1645 0.8753335 0.03175947
## AAACGAAGTACAAGCG-1 SeuratProject 3061 1401 0.9026281 0.08428618
## AAACGAAGTATGTCCA-1 SeuratProject 1548 453 0.8326925 0.20413437
## AAACGAATCCTGCTAC-1 SeuratProject 4845 1829 0.8851977 0.02930857
## AAACGAATCTCTAAGG-1 SeuratProject 1169 609 0.9076876 0.05303678
## Barcode Sample Tissue
## AAACGAAAGATCCCAT-1 AAACGAAAGATCCCAT 1 Atherosclerotic Core
## AAACGAACAGGCTACC-1 AAACGAACAGGCTACC 1 Atherosclerotic Core
## AAACGAAGTACAAGCG-1 AAACGAAGTACAAGCG 1 Atherosclerotic Core
## AAACGAAGTATGTCCA-1 AAACGAAGTATGTCCA 1 Atherosclerotic Core
## AAACGAATCCTGCTAC-1 AAACGAATCCTGCTAC 1 Atherosclerotic Core
## AAACGAATCTCTAAGG-1 AAACGAATCTCTAAGG 1 Atherosclerotic Core
# Plot the number of cells per sample
ggplot(meta, aes(x=Sample, fill=Tissue)) +
geom_bar() +
theme_bw() +
ggtitle("QC: Number of Cells per Sample") +
ylab("Number")
# Plot the number of transcripts per cell
ggplot(meta, aes(x=nUMI, fill=Sample, color=Sample)) +
geom_density(alpha=0.2) +
facet_grid(~Tissue) +
scale_x_log10() +
theme_bw() +
theme(strip.text.x=element_text(size=10)) +
ggtitle("QC: UMI Counts (Transcripts) per Cell") +
ylab("log10 Cell Density") +
geom_vline(xintercept=500, linetype="dashed")
# Plot the number of genes detected per cell
ggplot(meta, aes(x=nGene, color=Sample, fill=Sample)) +
geom_density(alpha=0.2) +
facet_grid(~Tissue) +
scale_x_log10() +
theme_bw() +
ylab("log10 Cell Density") +
theme(strip.text.x=element_text(size=10)) +
ggtitle("QC: Number of Genes Detected per Cell") +
geom_vline(xintercept=300, linetype="dashed")
ggplot(meta, aes(x=Sample, y=nGene, fill=Tissue)) +
geom_boxplot(outlier.alpha=0.5) +
theme_bw() +
scale_y_log10() +
ylab("log10 Number of Genes") +
ggtitle("QC: NCells vs NGenes")
# Plot nUMIs vs nGenes per cell along with mitoRatio
ggplot(meta, aes(x=nUMI, y=nGene, color=mitoRatio)) +
geom_point(alpha=0.5) +
scale_color_gradient(low="gray90", high="black") +
geom_smooth(method="lm", color="blue", se=F) +
facet_grid(Tissue~Sample) +
theme_bw() +
theme(strip.text.x=element_text(size=10)) +
geom_vline(xintercept=500, color="red") +
geom_hline(yintercept=250, color="red") +
xlab("Number of UMIs per Cell") +
ylab("Number of Genes per Cell") +
ggtitle("Number of UMIs vs Genes per Cell")
# Plot proportion of mitochondrial counts
ggplot(meta, aes(x=mitoRatio, color=Sample, fill=Sample)) +
geom_density(alpha=0.2) +
theme_bw() +
scale_x_log10() +
geom_vline(xintercept=0.2) +
xlab("Proportion of Mitochondrial Counts Detected") +
ylab("Cell Density") +
ggtitle("QC: Proportion of Mitochondrial Counts")
# Plot the overall complexity of the gene expression by visualizing the genes detected per UMI
ggplot(meta, aes(x=log10GenesPerUMI, color=Sample, fill=Sample)) +
geom_density(alpha=0.2) +
theme_bw() +
facet_grid(~Tissue) +
theme(strip.text.x=element_text(size=10)) +
geom_vline(xintercept=0.8, linetype="dashed") +
ggtitle("QC: Complexity of Gene Expression") +
ylab("log10 Cell Density")
# Filter out low quality cells
f.seurat <- subset(x=seurat,
subset=(nUMI >= 500) &
(nGene >= 250) &
(log10GenesPerUMI > 0.8) &
(mitoRatio < 0.2))
# Compare seurat objects before and after cell-level filtering
seurat # Before filtering
## An object of class Seurat
## 33538 features across 51851 samples within 1 assay
## Active assay: RNA (33538 features, 0 variable features)
f.seurat # After cell-level filtering
## An object of class Seurat
## 33538 features across 48236 samples within 1 assay
## Active assay: RNA (33538 features, 0 variable features)
# Assign the minimum counts (User-defined!)
count.threshold <- 10
# Extract counts (turns into a parse matrix)
counts <- GetAssayData(object=f.seurat, slot="counts")
# Create a logical matrix identifying non-zero count genes
nonzero <- counts > 0
# Create a logical vector identifying genes with counts over threshold
keep <- rowSums(nonzero) >= count.threshold
# Filter genes with the logical vector
counts <- counts[keep,]
# Create a new seurat object with the filtered count matrix
f.seurat <- CreateSeuratObject(counts,
meta.data=f.seurat@meta.data)
# Compare the seurat objects before and after filtering
seurat # Before filtering
## An object of class Seurat
## 33538 features across 51851 samples within 1 assay
## Active assay: RNA (33538 features, 0 variable features)
f.seurat # After cell- and gene-level filtering
## An object of class Seurat
## 20561 features across 48236 samples within 1 assay
## Active assay: RNA (20561 features, 0 variable features)
# Extract metadata after filtering
new.meta <- f.seurat@meta.data
# Add a column assigning before or after filtering
meta$Filtering <- "Before Filtering"
new.meta$Filtering <- "After Filtering"
# Remove duplicated columns
new.meta <- new.meta[, colnames(new.meta) %in% colnames(meta)]
# Combine the metadata before and after filtering
both.meta <- rbind(meta, new.meta)
both.meta$Filtering <- factor(both.meta$Filtering,
levels=c("Before Filtering", "After Filtering"))
# Create a bar plot comparing number of cells per sample before and after filtering
ggplot(both.meta, aes(x=Sample, fill=Tissue)) +
geom_bar() +
facet_grid(~Filtering) +
theme_bw() +
theme(strip.text.x=element_text(size=10)) +
ggtitle("QC: Number of Cells per Sample (before & after Filtering)") +
ylab("Number")
# Plot the number of transcripts per cell before and after filtering
ggplot(both.meta, aes(x=nUMI, fill=Sample, color=Sample)) +
geom_density(alpha=0.2) +
scale_x_log10() +
theme_bw() +
facet_grid(Tissue~Filtering) +
theme(strip.text.x=element_text(size=10)) +
ggtitle("QC: UMI Counts (Transcripts) per Cell (before & after Filtering)") +
ylab("log10 Cell Density") +
geom_vline(xintercept=500, linetype="dashed")
# Plot the number of genes detected per cell before and after filtering
# Density plot
ggplot(both.meta, aes(x=nGene, color=Sample, fill=Sample)) +
geom_density(alpha=0.2) +
scale_x_log10() +
theme_bw() +
facet_grid(~Filtering) +
theme(strip.text.x=element_text(size=10)) +
ylab("log10 Cell Density") +
ggtitle("QC: Number of Genes Detected per Cell (before & after Filtering)") +
geom_vline(xintercept=300, linetype="dashed")
# Box plot
ggplot(both.meta, aes(x=Sample, y=nGene, fill=Tissue)) +
geom_boxplot(outlier.alpha=0.5) +
theme_bw() +
facet_grid(~Filtering) +
theme(strip.text.x=element_text(size=10)) +
scale_y_log10() +
ylab("log10 Number of Genes") +
ggtitle("QC: NCells vs NGenes (before and after Filtering)")
# Plot nUMIs vs nGenes per cell along with mitoRatio (before and after filtering)
ggplot(both.meta, aes(x=nUMI, y=nGene, color=mitoRatio)) +
geom_point(alpha=0.5) +
scale_color_gradient(low="gray90", high="black") +
geom_smooth(method="lm", color="blue", se=F) +
facet_grid(Tissue~Filtering) +
theme_bw() +
theme(strip.text=element_text(size=10)) +
geom_vline(xintercept=500, color="red") +
geom_hline(yintercept=250, color="red") +
xlab("Number of UMIs per Cell") +
ylab("Number of Genes per Cell") +
ggtitle("Number of UMIs vs Genes per Cell (before & after Filtering)")
# NOTE: Given the fact that this raw data already had mitoRatio close to 0, mitoRatio comparison is skipped
# Plot the overall complexity of the gene expression by visualizing the genes detected per UMI (before and after filtering)
ggplot(both.meta, aes(x=log10GenesPerUMI, color=Sample, fill=Sample)) +
geom_density(alpha=0.2) +
theme_bw() +
facet_grid(Tissue~Filtering) +
theme(strip.text.x=element_text(size=10)) +
geom_vline(xintercept=0.8, linetype="dashed") +
ggtitle("QC: Complexity of Gene Expression (before and after Filtering)") +
ylab("log10 Cell Density")
# NOTE: This chunk has to be written by users!!!
#
#
#
# Import a dataset storing cell cycle genes
cc.genes <- read_csv("https://raw.githubusercontent.com/hbc/tinyatlas/master/cell_cycle/Homo_sapiens.csv")
# Combine two data frames
cc.genes <- inner_join(cc.genes, gene.db, by=c("geneID"="gene_id"))
# Create a vector storing S phase genes
s.genes <- unique(subset(cc.genes, phase=="S")$gene_name)
# Create a vector storing G2M phase genes
g2m.genes <- unique(subset(cc.genes, phase=="G2/M")$gene_name)
# Explore the outputs
head(s.genes)
## [1] "UBR7" "RFC2" "RAD51" "MCM2" "TIPIN" "MCM6"
length(s.genes)
## [1] 43
head(g2m.genes)
## [1] "NCAPD2" "ANLN" "TACC3" "HMMR" "GTSE1" "NDC80"
length(g2m.genes)
## [1] 54
# Normalize the counts
ccycle.seurat <- NormalizeData(f.seurat)
# Calculate cell cycle scores using CellCycleScoreing() function
ccycle.seurat <- CellCycleScoring(ccycle.seurat,
g2m.features=g2m.genes,
s.features=s.genes)
# Explore the updated metadata
head(ccycle.seurat@meta.data)
## orig.ident nCount_RNA nFeature_RNA seq.folder nUMI
## AAACGAAAGATCCCAT-1 SeuratProject 7783 2325 SeuratProject 7783
## AAACGAACAGGCTACC-1 SeuratProject 4723 1645 SeuratProject 4723
## AAACGAAGTACAAGCG-1 SeuratProject 3061 1401 SeuratProject 3061
## AAACGAATCCTGCTAC-1 SeuratProject 4845 1829 SeuratProject 4845
## AAACGAATCTCTAAGG-1 SeuratProject 1169 609 SeuratProject 1169
## AAACGCTAGCTGGCTC-1 SeuratProject 2064 824 SeuratProject 2064
## nGene log10GenesPerUMI mitoRatio Barcode Sample
## AAACGAAAGATCCCAT-1 2325 0.8651493 0.03880252 AAACGAAAGATCCCAT 1
## AAACGAACAGGCTACC-1 1645 0.8753335 0.03175947 AAACGAACAGGCTACC 1
## AAACGAAGTACAAGCG-1 1401 0.9026281 0.08428618 AAACGAAGTACAAGCG 1
## AAACGAATCCTGCTAC-1 1829 0.8851977 0.02930857 AAACGAATCCTGCTAC 1
## AAACGAATCTCTAAGG-1 609 0.9076876 0.05303678 AAACGAATCTCTAAGG 1
## AAACGCTAGCTGGCTC-1 824 0.8796931 0.11870155 AAACGCTAGCTGGCTC 1
## Tissue S.Score G2M.Score Phase
## AAACGAAAGATCCCAT-1 Atherosclerotic Core -0.029714846 -0.031268699 G1
## AAACGAACAGGCTACC-1 Atherosclerotic Core -0.057569844 -0.079933374 G1
## AAACGAAGTACAAGCG-1 Atherosclerotic Core -0.009931357 0.004737825 G2M
## AAACGAATCCTGCTAC-1 Atherosclerotic Core 0.024042318 0.001259567 S
## AAACGAATCTCTAAGG-1 Atherosclerotic Core -0.041629408 -0.022850511 G1
## AAACGCTAGCTGGCTC-1 Atherosclerotic Core -0.038694802 -0.042638380 G1
# Examine variation from cell cycle
if ("Phase" %in% colnames(ccycle.seurat@meta.data)) {
# Identify the most variable genes
ccycle.seurat <- FindVariableFeatures(ccycle.seurat,
selection.method="vst",
nfeatures=2000,
verbose=T)
# Scale the data (to mean = 0, variance = 1)
ccycle.seurat <- ScaleData(ccycle.seurat)
# Run PCA
ccycle.seurat <- RunPCA(ccycle.seurat)
# Plot the PCA result
DimPlot(ccycle.seurat,
reduction="pca",
group.by="Phase",
split.by="Phase")
} else {
print("No cell cycle effects were found")
}
# Run below in case your dataset has variations to regress out and a single group/condition
# ccycle.seurat <- SCTransform(ccycle.seurat,
# vars.to.regress=c("mitoRatio"))
# Set a condition to split
Condition <- "Tissue"
# Split the filtered seurat object by condition
split.seurat <- SplitObject(f.seurat, split.by=Condition)
options(future.globals.maxSize=4000 * 1024^2)
# Iterately run NormalizeData(), CellCycleScoring(), and SCTransform()
for (i in 1:length(split.seurat)) {
# Normalize
split.seurat[[i]] <- NormalizeData(split.seurat[[i]], verbose=T)
# Score cells for cell cycle
split.seurat[[i]] <- CellCycleScoring(split.seurat[[i]],
g2m.features=g2m.genes,
s.features=s.genes)
# Perform SCTransformation: will rank the genes by residual variance and output
# the 3000 most variant genes by default.
# Set a higher value than 3000 to variable.features.n if your dataset is
# extra-large.
split.seurat[[i]] <- SCTransform(split.seurat[[i]],
vars.to.regress=c("mitoRatio",
"S.Score",
"G2M.Score"))
}
##
|
| | 0%
|
|================== | 25%
|
|=================================== | 50%
|
|==================================================== | 75%
|
|======================================================================| 100%
##
|
| | 0%
|
|== | 3%
|
|==== | 5%
|
|====== | 8%
|
|======= | 11%
|
|========= | 13%
|
|=========== | 16%
|
|============= | 18%
|
|=============== | 21%
|
|================= | 24%
|
|================== | 26%
|
|==================== | 29%
|
|====================== | 32%
|
|======================== | 34%
|
|========================== | 37%
|
|============================ | 39%
|
|============================= | 42%
|
|=============================== | 45%
|
|================================= | 47%
|
|=================================== | 50%
|
|===================================== | 53%
|
|======================================= | 55%
|
|========================================= | 58%
|
|========================================== | 61%
|
|============================================ | 63%
|
|============================================== | 66%
|
|================================================ | 68%
|
|================================================== | 71%
|
|==================================================== | 74%
|
|===================================================== | 76%
|
|======================================================= | 79%
|
|========================================================= | 82%
|
|=========================================================== | 84%
|
|============================================================= | 87%
|
|=============================================================== | 89%
|
|================================================================ | 92%
|
|================================================================== | 95%
|
|==================================================================== | 97%
|
|======================================================================| 100%
##
|
| | 0%
|
|== | 3%
|
|==== | 5%
|
|====== | 8%
|
|======= | 11%
|
|========= | 13%
|
|=========== | 16%
|
|============= | 18%
|
|=============== | 21%
|
|================= | 24%
|
|================== | 26%
|
|==================== | 29%
|
|====================== | 32%
|
|======================== | 34%
|
|========================== | 37%
|
|============================ | 39%
|
|============================= | 42%
|
|=============================== | 45%
|
|================================= | 47%
|
|=================================== | 50%
|
|===================================== | 53%
|
|======================================= | 55%
|
|========================================= | 58%
|
|========================================== | 61%
|
|============================================ | 63%
|
|============================================== | 66%
|
|================================================ | 68%
|
|================================================== | 71%
|
|==================================================== | 74%
|
|===================================================== | 76%
|
|======================================================= | 79%
|
|========================================================= | 82%
|
|=========================================================== | 84%
|
|============================================================= | 87%
|
|=============================================================== | 89%
|
|================================================================ | 92%
|
|================================================================== | 95%
|
|==================================================================== | 97%
|
|======================================================================| 100%
##
|
| | 0%
|
|================== | 25%
|
|=================================== | 50%
|
|==================================================== | 75%
|
|======================================================================| 100%
##
|
| | 0%
|
|== | 2%
|
|=== | 5%
|
|===== | 7%
|
|======= | 10%
|
|========= | 12%
|
|========== | 15%
|
|============ | 17%
|
|============== | 20%
|
|=============== | 22%
|
|================= | 24%
|
|=================== | 27%
|
|==================== | 29%
|
|====================== | 32%
|
|======================== | 34%
|
|========================== | 37%
|
|=========================== | 39%
|
|============================= | 41%
|
|=============================== | 44%
|
|================================ | 46%
|
|================================== | 49%
|
|==================================== | 51%
|
|====================================== | 54%
|
|======================================= | 56%
|
|========================================= | 59%
|
|=========================================== | 61%
|
|============================================ | 63%
|
|============================================== | 66%
|
|================================================ | 68%
|
|================================================== | 71%
|
|=================================================== | 73%
|
|===================================================== | 76%
|
|======================================================= | 78%
|
|======================================================== | 80%
|
|========================================================== | 83%
|
|============================================================ | 85%
|
|============================================================= | 88%
|
|=============================================================== | 90%
|
|================================================================= | 93%
|
|=================================================================== | 95%
|
|==================================================================== | 98%
|
|======================================================================| 100%
##
|
| | 0%
|
|== | 2%
|
|=== | 5%
|
|===== | 7%
|
|======= | 10%
|
|========= | 12%
|
|========== | 15%
|
|============ | 17%
|
|============== | 20%
|
|=============== | 22%
|
|================= | 24%
|
|=================== | 27%
|
|==================== | 29%
|
|====================== | 32%
|
|======================== | 34%
|
|========================== | 37%
|
|=========================== | 39%
|
|============================= | 41%
|
|=============================== | 44%
|
|================================ | 46%
|
|================================== | 49%
|
|==================================== | 51%
|
|====================================== | 54%
|
|======================================= | 56%
|
|========================================= | 59%
|
|=========================================== | 61%
|
|============================================ | 63%
|
|============================================== | 66%
|
|================================================ | 68%
|
|================================================== | 71%
|
|=================================================== | 73%
|
|===================================================== | 76%
|
|======================================================= | 78%
|
|======================================================== | 80%
|
|========================================================== | 83%
|
|============================================================ | 85%
|
|============================================================= | 88%
|
|=============================================================== | 90%
|
|================================================================= | 93%
|
|=================================================================== | 95%
|
|==================================================================== | 98%
|
|======================================================================| 100%
# Set default assay to the integrated assay
DefaultAssay(integ.seurat) <- "integrated"
# Scale the integrated data
integ.seurat <- ScaleData(integ.seurat)
# Run PCA
integ.seurat <- RunPCA(integ.seurat)
# Present PCA
pca.1 <- DimPlot(integ.seurat,
reduction="pca",
group.by=Condition) +
ggtitle(paste("PCA by", Condition))
pca.2 <- DimPlot(integ.seurat,
reduction="pca",
group.by=Condition,
split.by=Condition) +
ggtitle(paste("PCA by", Condition))
pca.1 + pca.2
# Run UMAP
integ.seurat <- RunUMAP(integ.seurat, dims=1:30, reduction="pca")
# Present UMAP
umap.1 <- DimPlot(integ.seurat,
reduction="umap",
group.by=Condition) + ggtitle(paste("UMAP by", Condition))
umap.2 <- DimPlot(integ.seurat,
reduction="umap", group.by=Condition,
split.by=Condition) + ggtitle(paste("UMAP by", Condition))
umap.1 + umap.2
#### TEMPORARILY NOT EXECUTABLE DUE TO COMPATIBILITY ISSUES IN SEURAT 4.0.0 ####
# integ.seurat <- JackStraw(integ.seurat, num.replicate=100)
# integ.seurat <- ScoreJackStraw(integ.seurat, dims=1:20)
# JackStrawPlot(integ.seurat, dims=1:20)
ElbowPlot(integ.seurat, ndims=50)
DimHeatmap(integ.seurat,
dims=1:15,
cells=500,
balanced=T)
print(integ.seurat[["pca"]],
dims=1:15,
nfeatures=5)
## PC_ 1
## Positive: IGFBP7, SELENOM, ADIRF, MGP, CD151
## Negative: CYBA, TYROBP, FCER1G, SRGN, AIF1
## PC_ 2
## Positive: NPC2, AIF1, CST3, TYROBP, FCER1G
## Negative: IL32, RPS27, RPS29, CD3D, CD2
## PC_ 3
## Positive: RAMP2, ECSCR, CALCRL, PECAM1, CLEC14A
## Negative: TAGLN, ACTA2, TPM2, MYL9, FXYD1
## PC_ 4
## Positive: MYH11, MYL9, ACTA2, PLN, TPM2
## Negative: LUM, COL1A2, CFH, DCN, COL3A1
## PC_ 5
## Positive: C1QB, C1QA, C1QC, SELENOP, HLA-DQA1
## Negative: SPP1, FABP5, SH3BGRL3, S100A10, CD36
## PC_ 6
## Positive: GNG11, FABP4, SPRY1, CXorf36, APOE
## Negative: EDN1, BMX, ITLN1, PROCR, ELN
## PC_ 7
## Positive: CD79A, MS4A1, BANK1, RPL13, RPS18
## Negative: CCL4, NKG7, CCL5, CTSD, LGMN
## PC_ 8
## Positive: TPSAB1, CPA3, TPSB2, MS4A2, HPGDS
## Negative: S100A8, S100A9, FCN1, LYZ, SERPINA1
## PC_ 9
## Positive: TPSAB1, CPA3, TPSB2, MS4A2, HDC
## Negative: CD79A, MS4A1, BANK1, IGHM, RALGPS2
## PC_ 10
## Positive: IL7R, TPT1, MAL, LEF1, CD40LG
## Negative: NKG7, CTSW, KLRD1, GNLY, PRF1
## PC_ 11
## Positive: MYH11, SVIL, FLNA, PLN, DST
## Negative: FRZB, FOS, ZFP36, DLX5, DLX6-AS1
## PC_ 12
## Positive: SUCNR1, SOST, DLX6-AS1, IGFBP6, PDE5A
## Negative: CCDC102B, STEAP4, COX4I2, LHFPL6, JUN
## PC_ 13
## Positive: KLF6, DUSP1, JUNB, IGKC, IER2
## Negative: TPT1, EEF1A1, RPS12, RPL10, RPS18
## PC_ 14
## Positive: CCL14, MMRN1, ACKR1, BMP4, DNASE1L3
## Negative: SLC9A3R2, PODXL, PTP4A3, GJA4, CXCL12
## PC_ 15
## Positive: PTGDS, SFRP2, DPT, RARRES2, CXCL12
## Negative: GZMK, EFEMP1, NDUFA4L2, WFDC1, FCER1A
# Based on the elbow plot and heatmap, 14 dims will be used for clustering
# in the next section!
# Find neighbors using KNN
integ.seurat <- FindNeighbors(integ.seurat, dims=1:13)
# Explore the clustering outputs
head(integ.seurat@meta.data)
## orig.ident nCount_RNA nFeature_RNA seq.folder nUMI
## AAACGAAAGATCCCAT-1 SeuratProject 7783 2325 SeuratProject 7783
## AAACGAACAGGCTACC-1 SeuratProject 4723 1645 SeuratProject 4723
## AAACGAAGTACAAGCG-1 SeuratProject 3061 1401 SeuratProject 3061
## AAACGAATCCTGCTAC-1 SeuratProject 4844 1828 SeuratProject 4845
## AAACGAATCTCTAAGG-1 SeuratProject 1169 609 SeuratProject 1169
## AAACGCTAGCTGGCTC-1 SeuratProject 2063 823 SeuratProject 2064
## nGene log10GenesPerUMI mitoRatio Barcode Sample
## AAACGAAAGATCCCAT-1 2325 0.8651493 0.03880252 AAACGAAAGATCCCAT 1
## AAACGAACAGGCTACC-1 1645 0.8753335 0.03175947 AAACGAACAGGCTACC 1
## AAACGAAGTACAAGCG-1 1401 0.9026281 0.08428618 AAACGAAGTACAAGCG 1
## AAACGAATCCTGCTAC-1 1829 0.8851977 0.02930857 AAACGAATCCTGCTAC 1
## AAACGAATCTCTAAGG-1 609 0.9076876 0.05303678 AAACGAATCTCTAAGG 1
## AAACGCTAGCTGGCTC-1 824 0.8796931 0.11870155 AAACGCTAGCTGGCTC 1
## Tissue S.Score G2M.Score Phase
## AAACGAAAGATCCCAT-1 Atherosclerotic Core -0.01696573 -0.024925519 G1
## AAACGAACAGGCTACC-1 Atherosclerotic Core -0.03994105 -0.083265170 G1
## AAACGAAGTACAAGCG-1 Atherosclerotic Core -0.00665869 0.006498585 G2M
## AAACGAATCCTGCTAC-1 Atherosclerotic Core 0.02856264 0.014279768 S
## AAACGAATCTCTAAGG-1 Atherosclerotic Core -0.04324047 -0.014766047 G1
## AAACGCTAGCTGGCTC-1 Atherosclerotic Core -0.02271544 -0.024327827 G1
## nCount_SCT nFeature_SCT
## AAACGAAAGATCCCAT-1 4853 2151
## AAACGAACAGGCTACC-1 4431 1645
## AAACGAAGTACAAGCG-1 3762 1401
## AAACGAATCCTGCTAC-1 4522 1827
## AAACGAATCTCTAAGG-1 3328 738
## AAACGCTAGCTGGCTC-1 3411 903
# Create a vector storing your resolutions of interest
r.vector <- c(0.4, 0.8, 1.2)
# Set a function to visualize the clustering in each resolution setting
create.umap.fn <- function(obj, resol, split) {
# Find clusters
obj <- FindClusters(obj, resolution=resol)
# Assign the identity of the clustering (resolution=resol)
Idents(obj) <- paste0("integrated_snn_res.", as.character(resol))
# Visualize with a umap
DimPlot(obj,
reduction="umap",
label=T,
split.by=split,
label.size=5) + ggtitle(paste("UMAP by", split, "with Clustering"))
}
# Test clustering
create.umap.fn(integ.seurat, r.vector[1], Condition)
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 48236
## Number of edges: 1570220
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9332
## Number of communities: 15
## Elapsed time: 11 seconds
create.umap.fn(integ.seurat, r.vector[2], Condition)
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 48236
## Number of edges: 1570220
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9013
## Number of communities: 20
## Elapsed time: 10 seconds
create.umap.fn(integ.seurat, r.vector[3], Condition)
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 48236
## Number of edges: 1570220
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.8802
## Number of communities: 27
## Elapsed time: 9 seconds
# Determine your optimal resolution
final.resol <- r.vector[1] # Manually set
# Assign clusters to the seurat object
integ.seurat <- FindClusters(integ.seurat, resolution=final.resol)
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 48236
## Number of edges: 1570220
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9332
## Number of communities: 15
## Elapsed time: 11 seconds
Idents(integ.seurat) <- paste0("integrated_snn_res.", as.character(final.resol))
head(integ.seurat@meta.data)
## orig.ident nCount_RNA nFeature_RNA seq.folder nUMI
## AAACGAAAGATCCCAT-1 SeuratProject 7783 2325 SeuratProject 7783
## AAACGAACAGGCTACC-1 SeuratProject 4723 1645 SeuratProject 4723
## AAACGAAGTACAAGCG-1 SeuratProject 3061 1401 SeuratProject 3061
## AAACGAATCCTGCTAC-1 SeuratProject 4844 1828 SeuratProject 4845
## AAACGAATCTCTAAGG-1 SeuratProject 1169 609 SeuratProject 1169
## AAACGCTAGCTGGCTC-1 SeuratProject 2063 823 SeuratProject 2064
## nGene log10GenesPerUMI mitoRatio Barcode Sample
## AAACGAAAGATCCCAT-1 2325 0.8651493 0.03880252 AAACGAAAGATCCCAT 1
## AAACGAACAGGCTACC-1 1645 0.8753335 0.03175947 AAACGAACAGGCTACC 1
## AAACGAAGTACAAGCG-1 1401 0.9026281 0.08428618 AAACGAAGTACAAGCG 1
## AAACGAATCCTGCTAC-1 1829 0.8851977 0.02930857 AAACGAATCCTGCTAC 1
## AAACGAATCTCTAAGG-1 609 0.9076876 0.05303678 AAACGAATCTCTAAGG 1
## AAACGCTAGCTGGCTC-1 824 0.8796931 0.11870155 AAACGCTAGCTGGCTC 1
## Tissue S.Score G2M.Score Phase
## AAACGAAAGATCCCAT-1 Atherosclerotic Core -0.01696573 -0.024925519 G1
## AAACGAACAGGCTACC-1 Atherosclerotic Core -0.03994105 -0.083265170 G1
## AAACGAAGTACAAGCG-1 Atherosclerotic Core -0.00665869 0.006498585 G2M
## AAACGAATCCTGCTAC-1 Atherosclerotic Core 0.02856264 0.014279768 S
## AAACGAATCTCTAAGG-1 Atherosclerotic Core -0.04324047 -0.014766047 G1
## AAACGCTAGCTGGCTC-1 Atherosclerotic Core -0.02271544 -0.024327827 G1
## nCount_SCT nFeature_SCT integrated_snn_res.0.4
## AAACGAAAGATCCCAT-1 4853 2151 9
## AAACGAACAGGCTACC-1 4431 1645 11
## AAACGAAGTACAAGCG-1 3762 1401 3
## AAACGAATCCTGCTAC-1 4522 1827 2
## AAACGAATCTCTAAGG-1 3328 738 3
## AAACGCTAGCTGGCTC-1 3411 903 11
## seurat_clusters
## AAACGAAAGATCCCAT-1 9
## AAACGAACAGGCTACC-1 11
## AAACGAAGTACAAGCG-1 3
## AAACGAATCCTGCTAC-1 2
## AAACGAATCTCTAAGG-1 3
## AAACGCTAGCTGGCTC-1 11
# Create a vector storing the cluster numbers
cl.vector <- as.numeric(unique(integ.seurat@meta.data$seurat_clusters)) - 1
# Extract the number of cells per cluster
nc <- FetchData(integ.seurat,
# "ident": cluster
# Condition: experimental group (e.g. Tissue, Treatment, Genotype, etc)
vars=c("ident", Condition)) %>% group_by(ident, Tissue) %>% count()
# Plot the number of cells per cluster
nc %>%
ggplot(aes(x=ident, y=n, fill=ident)) +
geom_bar(stat="identity") +
facet_grid(~Tissue) +
theme_bw() +
theme(strip.text.x=element_text(size=10),
legend.title=element_blank()) +
ggtitle("Number of Cells per Cluster") +
ylab("Number of Cells") +
xlab("Cluster")
# Plot the proportion of cells per cluster
nc %>%
ggplot(aes(x=Tissue, y=n, fill=ident)) +
geom_bar(stat="identity", color="black", position="fill") +
theme_bw() +
theme(legend.title=element_blank()) +
ggtitle("Proportion of Cells per Cluster") +
ylab("Proportion")
DimPlot(integ.seurat,
label=T,
split.by="Phase") + NoLegend()
# Set a function creating a feature plot
#
# order: positive cells above negative cells
# min.cutoff: threshold of shading. 'q10' indicates the bottom 10% of the cells will be colored completely grey
featurepl.fn <- function(options) {
FeaturePlot(integ.seurat,
reduction="umap",
order=T,
features=options,
min.cutoff="q10",
label=T)
}
# Determine metrics of potential sources of variation
# (e.g. mitoRatio, S.Score, G2M.Score, etc)
other.sources <- c("nUMI", "nGene")
# Plot the sources along with clusters
featurepl.fn(other.sources)
# Define a vector storing variables to extract from the seurat object
clm <- c(paste0("PC_", 1:14),
"ident",
"UMAP_1",
"UMAP_2")
# Extract and save as a data frame
pcs <- FetchData(integ.seurat, vars=clm) %>%
group_by(ident) %>%
gather("PC", "Score", starts_with("PC"))
# Explore the output
head(pcs)
## # A tibble: 6 x 5
## # Groups: ident [4]
## ident UMAP_1 UMAP_2 PC Score
## <fct> <dbl> <dbl> <chr> <dbl>
## 1 9 6.57 1.27 PC_1 7.17
## 2 11 9.60 -5.70 PC_1 14.6
## 3 3 2.58 -1.50 PC_1 7.92
## 4 2 7.73 -10.0 PC_1 11.1
## 5 3 4.49 0.0636 PC_1 4.40
## 6 11 9.37 -5.11 PC_1 16.7
# Create a data frame storing center locations of each cluster
umap.center <- pcs %>%
group_by(ident) %>%
summarize(x=mean(UMAP_1), y=mean(UMAP_2))
# Explore the output
head(umap.center)
## # A tibble: 6 x 3
## ident x y
## <fct> <dbl> <dbl>
## 1 0 -7.39 -0.657
## 2 1 -5.98 -4.01
## 3 2 7.65 -8.89
## 4 3 3.50 -0.429
## 5 4 5.33 10.4
## 6 5 4.31 -8.41
# Convert the character variable PC to factor
pcs$PC <- factor(pcs$PC, levels=paste0("PC_", 1:15))
# Plot the PC scores by cluster
ggplot(pcs, aes(x=UMAP_1, y=UMAP_2)) +
geom_point(alpha=0.2, aes(color=Score)) +
facet_wrap(~PC) +
theme_bw() +
scale_color_gradient(guide=F, low="grey90", high="blue") +
geom_text(data=umap.center,
aes(label=ident, x=x, y=y)) +
theme(strip.text.x=element_text(size=10)) +
ggtitle("PC Scores by Cluster")
# Extract 10 most significant genes in the PC 1 and 2
print(integ.seurat[["pca"]], dims=1:5, nfeatures=10)
## PC_ 1
## Positive: IGFBP7, SELENOM, ADIRF, MGP, CD151, BGN, CAVIN3, CALD1, TAGLN, DSTN
## Negative: CYBA, TYROBP, FCER1G, SRGN, AIF1, CTSS, HLA-DRA, ITGB2, B2M, CD74
## PC_ 2
## Positive: NPC2, AIF1, CST3, TYROBP, FCER1G, FTL, FCGRT, PSAP, CD14, HLA-DRB1
## Negative: IL32, RPS27, RPS29, CD3D, CD2, CD3E, RPL28, TSC22D3, CD3G, ZFP36L2
## PC_ 3
## Positive: RAMP2, ECSCR, CALCRL, PECAM1, CLEC14A, PALMD, ADGRL4, IFI27, NPDC1, PLVAP
## Negative: TAGLN, ACTA2, TPM2, MYL9, FXYD1, CALD1, S100A4, TPM1, SOD3, MFGE8
## PC_ 4
## Positive: MYH11, MYL9, ACTA2, PLN, TPM2, LMOD1, CNN1, DSTN, PPP1R14A, CSRP1
## Negative: LUM, COL1A2, CFH, DCN, COL3A1, COL1A1, CCDC80, C1R, PCOLCE, SFRP2
## PC_ 5
## Positive: C1QB, C1QA, C1QC, SELENOP, HLA-DQA1, FOLR2, HLA-DPA1, HLA-DPB1, HLA-DRA, CD74
## Negative: SPP1, FABP5, SH3BGRL3, S100A10, CD36, SMIM25, FTH1, C15orf48, FBP1, MARCO
# Reset the default assay to "RNA"
DefaultAssay(integ.seurat) <- "RNA"
# Renormalize the data
integ.seurat <- NormalizeData(integ.seurat)
# Explore endothelial cell markers
# ENSG00000261371: CD31 (PECAM1)
# ENSG00000090339: ICAM1
ec.genes <- c("PECAM1", "ICAM1")
featurepl.fn(ec.genes)
# Explore smooth muscle cell markers
# ENSG00000107796: ACTA2
# ENSG00000133392: MYH11 (SMMHC)
smc.genes <- c("ACTA2", "MYH11")
featurepl.fn(smc.genes)
# Explore T cell markers
# ENSG00000110848: CD69 (early marker)
# ENSG00000010610: CD4
# ENSG00000153563: CD8
t.genes <- c("CD69",
"CD4",
"CD8A")
featurepl.fn(t.genes)
# Explore B cell markers
# ENSG00000177455: CD19
# ENSG00000081237: B220 (PTPRC)
b.genes <- c("CD19", "PTPRC")
featurepl.fn(b.genes)
# Explore macrophage markers
# ENSG00000121594: CD36
# ENSG00000114013: CD86
# ENSG00000129226: CD68
m.genes <- c("CD36", "CD86", "CD68")
featurepl.fn(m.genes)
# Run only if your dataset has a single group/condition
# markers <- FindAllMarkers(integ.seurat, only.pos=T, logfc.threshold=0.25)
#
# only.pos: keeps only positive changes
#
# logfc.threshold: min log2 foldchange for mean expression in this cluster vs other clusters. Default is 0.25
#
# min.diff.pct: min percent difference in cell population expressing the gene in this cluster vs all other clusters combined
#
# min.pct: min fraction of cell population expressing the gene in either of the clusters
# Reset default assays to "RNA"
DefaultAssay(integ.seurat) <- "RNA"
# Renormalize the data
integ.seurat <- NormalizeData(integ.seurat)
# Explore the clusters by Condition
cluster.stat <- table(integ.seurat@meta.data$seurat_clusters,
integ.seurat@meta.data$Tissue) %>% as.data.frame()
names(cluster.stat) <- c("Cluster", Condition, "N")
head(cluster.stat)
## Cluster Tissue N
## 1 0 Atherosclerotic Core 1660
## 2 1 Atherosclerotic Core 1041
## 3 2 Atherosclerotic Core 1964
## 4 3 Atherosclerotic Core 2468
## 5 4 Atherosclerotic Core 542
## 6 5 Atherosclerotic Core 851
# Extract cluster numbers having zero cells
cluster.stat.zero <- subset(cluster.stat, N == 0)$Cluster
# Set a function finding conserved markers and converting the output to a data frame
find.cmarkers.fn <- function(obj, cluster) {
marker.output <- FindConservedMarkers(obj,
ident.1=cluster,
grouping.var=Condition,
only.pos=T,
logfc.threshold=0.25) %>%
rownames_to_column("Gene") %>%
as.data.frame() %>%
mutate(Cluster=cluster)
}
# Find conserved markers in the cluster 0
cmarkers.df <- find.cmarkers.fn(integ.seurat, 0)
dim(cmarkers.df)
## [1] 597 14
head(cmarkers.df)
## Gene Atherosclerotic Core_p_val Atherosclerotic Core_avg_log2FC
## 1 CD2 0 2.122994
## 2 RGS1 0 2.065577
## 3 PTPRC 0 1.810517
## 4 ZFP36L2 0 1.631484
## 5 CD8A 0 1.901262
## 6 CD8B 0 1.601038
## Atherosclerotic Core_pct.1 Atherosclerotic Core_pct.2
## 1 0.744 0.143
## 2 0.451 0.101
## 3 0.898 0.326
## 4 0.912 0.799
## 5 0.404 0.037
## 6 0.319 0.029
## Atherosclerotic Core_p_val_adj Proximal Adjacent_p_val
## 1 0 0.00000e+00
## 2 0 6.41786e-163
## 3 0 0.00000e+00
## 4 0 0.00000e+00
## 5 0 0.00000e+00
## 6 0 0.00000e+00
## Proximal Adjacent_avg_log2FC Proximal Adjacent_pct.1 Proximal Adjacent_pct.2
## 1 1.9466847 0.777 0.244
## 2 0.8991752 0.425 0.319
## 3 1.4455357 0.902 0.599
## 4 1.5679094 0.895 0.740
## 5 1.5779570 0.322 0.046
## 6 1.4339350 0.267 0.035
## Proximal Adjacent_p_val_adj max_pval minimump_p_val Cluster
## 1 0.000000e+00 0.00000e+00 0 0
## 2 1.319576e-158 6.41786e-163 0 0
## 3 0.000000e+00 0.00000e+00 0 0
## 4 0.000000e+00 0.00000e+00 0 0
## 5 0.000000e+00 0.00000e+00 0 0
## 6 0.000000e+00 0.00000e+00 0 0
# Find conserved markers in the rest of the clusters iteratively
# NOTE: skip clusters having zero cells!!
i <- 1
while (i <= max(cl.vector)) {
if (i %in% cluster.stat.zero) {
i <- i + 1
} else {
rest.df <- find.cmarkers.fn(integ.seurat, i)
cmarkers.df <- rbind(cmarkers.df, rest.df)
i <- i + 1
}
}
# Explore the updated conserved markers data frame
dim(cmarkers.df)
## [1] 9762 14
head(cmarkers.df)
## Gene Atherosclerotic Core_p_val Atherosclerotic Core_avg_log2FC
## 1 CD2 0 2.122994
## 2 RGS1 0 2.065577
## 3 PTPRC 0 1.810517
## 4 ZFP36L2 0 1.631484
## 5 CD8A 0 1.901262
## 6 CD8B 0 1.601038
## Atherosclerotic Core_pct.1 Atherosclerotic Core_pct.2
## 1 0.744 0.143
## 2 0.451 0.101
## 3 0.898 0.326
## 4 0.912 0.799
## 5 0.404 0.037
## 6 0.319 0.029
## Atherosclerotic Core_p_val_adj Proximal Adjacent_p_val
## 1 0 0.00000e+00
## 2 0 6.41786e-163
## 3 0 0.00000e+00
## 4 0 0.00000e+00
## 5 0 0.00000e+00
## 6 0 0.00000e+00
## Proximal Adjacent_avg_log2FC Proximal Adjacent_pct.1 Proximal Adjacent_pct.2
## 1 1.9466847 0.777 0.244
## 2 0.8991752 0.425 0.319
## 3 1.4455357 0.902 0.599
## 4 1.5679094 0.895 0.740
## 5 1.5779570 0.322 0.046
## 6 1.4339350 0.267 0.035
## Proximal Adjacent_p_val_adj max_pval minimump_p_val Cluster
## 1 0.000000e+00 0.00000e+00 0 0
## 2 1.319576e-158 6.41786e-163 0 0
## 3 0.000000e+00 0.00000e+00 0 0
## 4 0.000000e+00 0.00000e+00 0 0
## 5 0.000000e+00 0.00000e+00 0 0
## 6 0.000000e+00 0.00000e+00 0 0
# Remove spaces in the column names
column.names <- colnames(cmarkers.df)
column.names <- str_replace(column.names, " ", "_")
names(cmarkers.df) <- column.names
head(cmarkers.df)
## Gene Atherosclerotic_Core_p_val Atherosclerotic_Core_avg_log2FC
## 1 CD2 0 2.122994
## 2 RGS1 0 2.065577
## 3 PTPRC 0 1.810517
## 4 ZFP36L2 0 1.631484
## 5 CD8A 0 1.901262
## 6 CD8B 0 1.601038
## Atherosclerotic_Core_pct.1 Atherosclerotic_Core_pct.2
## 1 0.744 0.143
## 2 0.451 0.101
## 3 0.898 0.326
## 4 0.912 0.799
## 5 0.404 0.037
## 6 0.319 0.029
## Atherosclerotic_Core_p_val_adj Proximal_Adjacent_p_val
## 1 0 0.00000e+00
## 2 0 6.41786e-163
## 3 0 0.00000e+00
## 4 0 0.00000e+00
## 5 0 0.00000e+00
## 6 0 0.00000e+00
## Proximal_Adjacent_avg_log2FC Proximal_Adjacent_pct.1 Proximal_Adjacent_pct.2
## 1 1.9466847 0.777 0.244
## 2 0.8991752 0.425 0.319
## 3 1.4455357 0.902 0.599
## 4 1.5679094 0.895 0.740
## 5 1.5779570 0.322 0.046
## 6 1.4339350 0.267 0.035
## Proximal_Adjacent_p_val_adj max_pval minimump_p_val Cluster
## 1 0.000000e+00 0.00000e+00 0 0
## 2 1.319576e-158 6.41786e-163 0 0
## 3 0.000000e+00 0.00000e+00 0 0
## 4 0.000000e+00 0.00000e+00 0 0
## 5 0.000000e+00 0.00000e+00 0 0
## 6 0.000000e+00 0.00000e+00 0 0
# Create a vector storing mean log2FC across the conditions
log2FC.columns <- column.names[str_detect(column.names, "log2FC")]
mean.log2FC <- rowMeans(cmarkers.df[, log2FC.columns])
# Combine the calculated mean log2FC with the conserved marker data frame
cmarkers.df <- cbind(cmarkers.df, mean_log2FC = mean.log2FC)
# Extract top 10 conserved markers by cluster
top.cmarkers <- cmarkers.df %>%
group_by(Cluster) %>%
top_n(n=10, wt=mean_log2FC)
head(top.cmarkers)
## # A tibble: 6 x 15
## # Groups: Cluster [1]
## Gene Atherosclerotic_C… Atherosclerotic_… Atherosclerotic_… Atherosclerotic_…
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 CD2 0 2.12 0.744 0.143
## 2 DUSP2 0 2.63 0.748 0.224
## 3 CXCR4 0 2.34 0.882 0.283
## 4 GZMK 0 3.69 0.692 0.051
## 5 GZMA 0 2.48 0.763 0.103
## 6 TRBC2 0 2.06 0.701 0.158
## # … with 10 more variables: Atherosclerotic_Core_p_val_adj <dbl>,
## # Proximal_Adjacent_p_val <dbl>, Proximal_Adjacent_avg_log2FC <dbl>,
## # Proximal_Adjacent_pct.1 <dbl>, Proximal_Adjacent_pct.2 <dbl>,
## # Proximal_Adjacent_p_val_adj <dbl>, max_pval <dbl>, minimump_p_val <dbl>,
## # Cluster <dbl>, mean_log2FC <dbl>
# Visualize cells the top 10 genes using featurepl.fn by cluster
top.cmarkers <- top.cmarkers %>%
group_by(Cluster) %>%
nest() %>%
mutate(feature_plot=map(data, ~ featurepl.fn(.x$Gene)))
for (j in top.cmarkers$Cluster) {
df1 <- subset(top.cmarkers, Cluster == j)
df2 <- df1 %>% unnest(data)
print(paste0("Cluster ", j, ":"))
print(df2$Gene)
print(df1$feature_plot)
}
## [1] "Cluster 0:"
## [1] "CD2" "DUSP2" "CXCR4" "GZMK" "GZMA" "TRBC2" "CD69" "IL32" "CCL5"
## [10] "CCL4"
## [[1]]
##
## [1] "Cluster 1:"
## [1] "CD2" "IL7R" "LTB" "TRBC2" "BTG1" "TRAC" "IL32" "CCR7" "SARAF"
## [10] "TRBC1"
## [[1]]
##
## [1] "Cluster 2:"
## [1] "CCDC80" "SFRP2" "COL1A2" "GSN" "C1R" "MGP" "IGFBP6" "LUM"
## [9] "DCN" "CCL19"
## [[1]]
##
## [1] "Cluster 3:"
## [1] "RBP7" "ACKR1" "CALCRL" "RAMP3" "GNG11" "IFI27" "IGFBP4" "RAMP2"
## [9] "SOX18" "PLVAP"
## [[1]]
##
## [1] "Cluster 4:"
## [1] "C1QA" "C1QC" "C1QB" "F13A1" "FOLR2" "CCL18"
## [7] "CCL3" "HLA-DRA" "SELENOP" "HLA-DPA1"
## [[1]]
##
## [1] "Cluster 5:"
## [1] "CALD1" "TPM2" "C11orf96" "TAGLN" "ACTA2" "TPM1"
## [7] "MYH11" "DSTN" "MYL9" "PPP1R14A"
## [[1]]
##
## [1] "Cluster 6:"
## [1] "APOC1" "CD68" "SPP1" "TYROBP" "FTH1" "FTL" "CTSB" "FABP5"
## [9] "CSTB" "FABP4"
## [[1]]
##
## [1] "Cluster 7:"
## [1] "CTSS" "S100A9" "S100A8" "MNDA" "IL1B" "CXCL8" "AIF1" "FCN1"
## [9] "LYZ" "AREG"
## [[1]]
##
## [1] "Cluster 8:"
## [1] "XCL1" "GNLY" "FGFBP2" "CTSW" "PRF1" "KLRD1" "GZMB" "CCL4"
## [9] "NKG7" "XCL2"
## [[1]]
##
## [1] "Cluster 9:"
## [1] "EDN1" "CXCL2" "CLEC14A" "VWF" "TM4SF1" "CLU" "RAMP2"
## [8] "CRTAC1" "ID1" "IGFBP3"
## [[1]]
##
## [1] "Cluster 10:"
## [1] "IGKC" "CD83" "MS4A1" "IGHA1" "IGHM" "CD79A" "IGLC2" "IGHG1" "IGLC3"
## [10] "IGHG2"
## [[1]]
##
## [1] "Cluster 11:"
## [1] "FRZB" "IGFBP2" "RAMP1" "SUCNR1" "PDE5A" "DLX6-AS1"
## [7] "DLX5" "OGN" "SOST" "PTN"
## [[1]]
##
## [1] "Cluster 12:"
## [1] "RHEX" "CPA3" "HPGDS" "HPGD" "MS4A2" "CTSG" "HDC" "TPSB2"
## [9] "TPSAB1" "AREG"
## [[1]]
##
## [1] "Cluster 13:"
## [1] "HIST1H1B" "MKI67" "NUSAP1" "TYMS" "CENPF" "STMN1"
## [7] "HMGB2" "TUBA1B" "HIST1H1D" "HIST1H4C"
## [[1]]
##
## [1] "Cluster 14:"
## [1] "TCL1A" "JCHAIN" "PLD4" "GZMB" "CLIC3" "PLAC8" "IRF8"
## [8] "PTGDS" "IRF7" "HERPUD1"
## [[1]]
# Re-assign cell identity
integ.seurat <- RenameIdents(integ.seurat,
"0"="CTL/NK",
"1"="T",
"2"="Fibroblast",
"3"="EC",
"4"="DC/Macrophage",
"5"="SMC",
"6"="Macrophage",
"7"="Neutrophil/Monocyte",
"8"="CTL/NK",
"9"="EC",
"10"="B",
"11"="Unclassified",
"12"="Mast cell/Basophil",
"13"="Proliferating",
"14"="Unclassified")
# Plot the UMAP with the cell types
DimPlot(integ.seurat,
reduction="umap",
label=T,
label.size=5,
repel=T) + ggtitle("Cell Type")
# Save the R object
write_rds(integ.seurat, "integ.seurat_label.rds")
# Create a text filing storing my session info
sink("sessionInfo_scRNAseq_v1.txt")
sessionInfo()
## R version 4.0.3 (2020-10-10)
## Platform: x86_64-conda-linux-gnu (64-bit)
## Running under: Ubuntu 20.04.2 LTS
##
## Matrix products: default
## BLAS/LAPACK: /home/mira/miniconda3/envs/scrna/lib/libopenblasp-r0.3.12.so
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
## [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
## [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] stats4 parallel stats graphics grDevices utils datasets
## [8] methods base
##
## other attached packages:
## [1] ensembldb_2.14.0 AnnotationFilter_1.14.0 GenomicFeatures_1.42.1
## [4] AnnotationDbi_1.52.0 Biobase_2.50.0 GenomicRanges_1.42.0
## [7] GenomeInfoDb_1.26.2 IRanges_2.24.1 S4Vectors_0.28.1
## [10] cowplot_1.1.1 AnnotationHub_2.22.0 BiocFileCache_1.14.0
## [13] dbplyr_2.1.0 BiocGenerics_0.36.0 SeuratObject_4.0.0
## [16] Seurat_4.0.0 data.table_1.14.0 ggrepel_0.9.1
## [19] forcats_0.5.1 stringr_1.4.0 dplyr_1.0.4
## [22] purrr_0.3.4 readr_1.4.0 tidyr_1.1.2
## [25] tibble_3.0.6 ggplot2_3.3.3 tidyverse_1.3.0
## [28] Matrix_1.3-2
##
## loaded via a namespace (and not attached):
## [1] utf8_1.1.4 reticulate_1.18
## [3] tidyselect_1.1.0 RSQLite_2.2.3
## [5] htmlwidgets_1.5.3 grid_4.0.3
## [7] BiocParallel_1.24.1 Rtsne_0.15
## [9] munsell_0.5.0 mutoss_0.1-12
## [11] codetools_0.2-18 ica_1.0-2
## [13] future_1.21.0 miniUI_0.1.1.1
## [15] withr_2.4.1 colorspace_2.0-0
## [17] highr_0.8 knitr_1.31
## [19] rstudioapi_0.13 ROCR_1.0-11
## [21] tensor_1.5 listenv_0.8.0
## [23] Rdpack_2.1.1 labeling_0.4.2
## [25] MatrixGenerics_1.2.1 GenomeInfoDbData_1.2.4
## [27] mnormt_2.0.2 polyclip_1.10-0
## [29] farver_2.0.3 bit64_4.0.5
## [31] TH.data_1.0-10 parallelly_1.23.0
## [33] vctrs_0.3.6 generics_0.1.0
## [35] xfun_0.21 R6_2.5.0
## [37] bitops_1.0-6 spatstat.utils_2.0-0
## [39] cachem_1.0.4 DelayedArray_0.16.1
## [41] assertthat_0.2.1 promises_1.2.0.1
## [43] scales_1.1.1 multcomp_1.4-16
## [45] gtable_0.3.0 globals_0.14.0
## [47] goftest_1.2-2 sandwich_3.0-0
## [49] rlang_0.4.10 splines_4.0.3
## [51] rtracklayer_1.50.0 lazyeval_0.2.2
## [53] broom_0.7.5 BiocManager_1.30.10
## [55] yaml_2.2.1 reshape2_1.4.4
## [57] abind_1.4-5 modelr_0.1.8
## [59] backports_1.2.1 httpuv_1.5.5
## [61] tools_4.0.3 ellipsis_0.3.1
## [63] jquerylib_0.1.3 RColorBrewer_1.1-2
## [65] ggridges_0.5.3 TFisher_0.2.0
## [67] Rcpp_1.0.6 plyr_1.8.6
## [69] progress_1.2.2 zlibbioc_1.36.0
## [71] RCurl_1.98-1.2 ps_1.5.0
## [73] prettyunits_1.1.1 rpart_4.1-15
## [75] openssl_1.4.3 deldir_0.2-10
## [77] pbapply_1.4-3 zoo_1.8-8
## [79] SummarizedExperiment_1.20.0 haven_2.3.1
## [81] cluster_2.1.1 fs_1.5.0
## [83] magrittr_2.0.1 RSpectra_0.16-0
## [85] scattermore_0.7 lmtest_0.9-38
## [87] reprex_1.0.0 RANN_2.6.1
## [89] tmvnsim_1.0-2 mvtnorm_1.1-1
## [91] ProtGenerics_1.22.0 fitdistrplus_1.1-3
## [93] matrixStats_0.58.0 hms_1.0.0
## [95] patchwork_1.1.1 mime_0.10
## [97] evaluate_0.14 xtable_1.8-4
## [99] XML_3.99-0.5 readxl_1.3.1
## [101] gridExtra_2.3 compiler_4.0.3
## [103] biomaRt_2.46.3 KernSmooth_2.23-18
## [105] crayon_1.4.1 htmltools_0.5.1.1
## [107] mgcv_1.8-34 later_1.1.0.1
## [109] lubridate_1.7.9.2 DBI_1.1.1
## [111] MASS_7.3-53.1 rappdirs_0.3.3
## [113] cli_2.3.1 rbibutils_2.0
## [115] metap_1.4 igraph_1.2.6
## [117] pkgconfig_2.0.3 sn_1.6-2
## [119] GenomicAlignments_1.26.0 numDeriv_2016.8-1.1
## [121] plotly_4.9.3 xml2_1.3.2
## [123] bslib_0.2.4 multtest_2.46.0
## [125] XVector_0.30.0 rvest_0.3.6
## [127] digest_0.6.27 sctransform_0.3.2
## [129] RcppAnnoy_0.0.18 spatstat.data_2.0-0
## [131] Biostrings_2.58.0 rmarkdown_2.7
## [133] cellranger_1.1.0 leiden_0.3.7
## [135] uwot_0.1.10 curl_4.3
## [137] shiny_1.6.0 Rsamtools_2.6.0
## [139] lifecycle_1.0.0 nlme_3.1-152
## [141] jsonlite_1.7.2 limma_3.46.0
## [143] viridisLite_0.3.0 askpass_1.1
## [145] fansi_0.4.2 pillar_1.5.0
## [147] lattice_0.20-41 plotrix_3.8-1
## [149] fastmap_1.1.0 httr_1.4.2
## [151] survival_3.2-7 interactiveDisplayBase_1.28.0
## [153] glue_1.4.2 spatstat_1.64-1
## [155] png_0.1-7 BiocVersion_3.12.0
## [157] bit_4.0.4 stringi_1.5.3
## [159] sass_0.3.1 blob_1.2.1
## [161] memoise_2.0.0 mathjaxr_1.2-0
## [163] irlba_2.3.3 future.apply_1.7.0
sink()